# Initialize required libraries
library(readr)
library(sqldf)
library(knitr)
library(VIM) # for checking missing values
library(dplyr) # for data aggregation
library(tidyr) # for reshaping data tables
library(ggplot2) # for visualization

# Data dictionary
# ● Signups.csv
#   o uid – unique user id (also the primary key)
#   o auth_type – three types: A, B, C. Assume this corresponds to whether the user
#     signed up via social login or a particular email client (Gmail, Yahoo, etc.)
#   o device – the device from which the user signed up; values 1–7, each denoting
#     a device type (e.g. PC, iOS app, Android app)
#   o signup_dt – date the user signed up
# ● Visits.csv
#   o uid – unique user id
#   o dt – date the user visited the Houzz website/app
# Load the two tables into R for the analysis
signups <- read_csv("data/signups.csv")
visits <- read_csv("data/visits.csv")
# Drop the first column of each table (row indices written by the exporter)
visits <- visits[, -1]
signups <- signups[, -1]

# Check for missing values in the two tables.
# Both aggr() plots show there are no missing values in either table.
aggr(signups, plot = TRUE, prop = TRUE, number = FALSE, label = TRUE,
     gap = TRUE, only.miss = TRUE)
aggr(visits, plot = TRUE, prop = TRUE, number = FALSE, label = TRUE,
     gap = TRUE, only.miss = TRUE)

# Check whether 'uid' is unique in each table
length(unique(signups$uid)) == nrow(signups)
## [1] TRUE
length(unique(visits$uid)) == nrow(visits)
## [1] FALSE
Check the intersection of users between the two tables.
# Users present in signups but not in visits (signed up, never visited)
signups[!(signups$uid %in% visits$uid), ]
## # A tibble: 18 × 4
## uid signup_dt auth_type device
## <int> <date> <chr> <int>
## 1 23695907 2016-10-17 A 5
## 2 21888962 2016-06-20 C 5
## 3 23716963 2016-10-19 C 4
## 4 23689756 2016-10-17 C 4
## 5 23718833 2016-10-19 A 5
## 6 23687826 2016-10-17 A 5
## 7 23701247 2016-10-18 A 5
## 8 23724196 2016-10-19 A 5
## 9 23686444 2016-10-17 A 5
## 10 23709678 2016-10-18 C 5
## 11 23696306 2016-10-17 C 5
## 12 23768847 2016-10-22 C 5
## 13 23690854 2016-10-17 C 5
## 14 23676462 2016-10-16 A 5
## 15 23702613 2016-10-18 C 5
## 16 23698102 2016-10-18 A 5
## 17 23685800 2016-10-17 C 4
## 18 23707134 2016-10-18 A 5

# Users present in visits but not in signups (should be empty)
visits[!(visits$uid %in% signups$uid), ]
## # A tibble: 0 × 2
## # ... with 2 variables: uid <int>, dt <date>

# This result shows the users in visits are a subset of the users in signups.
We can use 'sqldf' instead of 'dplyr' to run SQL inside R for the data manipulation.
(a) Devices:
# (a) Number of signed-up users per device, ascending by count
q1_devices <- sqldf('select count(uid) as num_users, device from signups group by device order by num_users')
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
q1_devices
## num_users device
## 1 791 3
## 2 3859 4
## 3 4981 2
## 4 6659 7
## 5 9957 5
## 6 13497 6
## 7 32245 1

# (b) Number of signed-up users per auth type, ascending by count
q1_authtype <- sqldf('select count(uid) as num_users, auth_type from signups group by auth_type order by num_users')
q1_authtype
## num_users auth_type
## 1 13485 B
## 2 18409 A
## 3 40095 C

# (c) Cross-tabulation: users per (device, auth_type) pair
q1_combine <- sqldf('select count(uid) as num_users, device, auth_type from signups group by device, auth_type')
q1_combine
## num_users device auth_type
## 1 6693 1 A
## 2 7267 1 B
## 3 18285 1 C
## 4 1004 2 A
## 5 643 2 B
## 6 3334 2 C
## 7 187 3 A
## 8 309 3 B
## 9 295 3 C
## 10 1007 4 A
## 11 22 4 B
## 12 2830 4 C
## 13 3571 5 A
## 14 59 5 B
## 15 6327 5 C
## 16 3904 6 A
## 17 2998 6 B
## 18 6595 6 C
## 19 2043 7 A
## 20 2187 7 B
## 21 2429 7 C
Build a grouped bar chart to better understand the table.
# Build a grouped bar chart to better understand the cross-tabulation
q1_combine$device <- as.factor(q1_combine$device) # treat device as a discrete x axis
g_bar <- ggplot(q1_combine, aes(x = device))
# geom_col() is the idiomatic equivalent of geom_bar(stat = 'identity')
g_bar + geom_col(aes(y = num_users, fill = auth_type), position = 'dodge') +
  labs(title = 'Frequency distribution of users by auth type and devices') +
  theme_linedraw()

# In the following questions, I will use 'dplyr' directly instead of 'sqldf'.
# Question 2: weekly retention cohorts by signup date
# Step 1: Join the two tables (every signup, with all of that user's visits;
# users who never visited get NA in dt)
activities <- left_join(signups, visits, by = 'uid')
# Step 2: Convert the visit date into whole weeks since signup.
# Week 0 = a visit on the signup day itself; week 1 = days 1-7 after signup; etc.
activities$visit_week <- as.integer(ceiling(difftime(activities$dt, activities$signup_dt, units = 'week')))
# Step 3: Count distinct visitors per (signup cohort, week)
act_tmp <- activities %>%
  group_by(signup_dt, visit_week) %>%
  summarise(num_users = n_distinct(uid))
# Step 4: Reshape the table (rows: signup_dt, columns: visit_week)
cohorts <- spread(act_tmp, key = 'visit_week', value = 'num_users')
# Keep only cohorts old enough to have ~24 weeks of history, and weeks 0-24
# (column 1 = signup_dt, column 2 = week 0 = cohort size)
cohorts_filter <- cohorts[cohorts$signup_dt <= as.Date('2016-10-30'), c(1:26)]
# Step 5: Convert absolute counts to percentages of the cohort size
# (% of signed-up users visiting again in each week)
cohort_perct <- data.frame(lapply(cohorts_filter[, -c(1, 2)], function(x) x/cohorts_filter[, 2]))
cohort_perct <- bind_cols(cohorts_filter[, c(1, 2)], cohort_perct) # prepend signup_dt and cohort size
colnames(cohort_perct) <- append(c('Signupdt', '# signed up'), paste('% visiting in week ', 1:(ncol(cohort_perct) - 2), sep = ' ')) # descriptive column names
cohort_perct
## Source: local data frame [152 x 26]
## Groups: signup_dt [152]
##
## Signupdt `# signed up` `% visiting in week 1`
## <date> <int> <dbl>
## 1 2016-06-01 400 0.5975000
## 2 2016-06-02 439 0.6674260
## 3 2016-06-03 407 0.6805897
## 4 2016-06-04 436 0.6720183
## 5 2016-06-05 540 0.6240741
## 6 2016-06-06 463 0.6328294
## 7 2016-06-07 440 0.6340909
## 8 2016-06-08 441 0.6507937
## 9 2016-06-09 376 0.6515957
## 10 2016-06-10 387 0.6589147
## # ... with 142 more rows, and 23 more variables: `% visiting in week
## # 2` <dbl>, `% visiting in week 3` <dbl>, ..., `% visiting in week 24` <dbl>
Plot the heatmap.
# Load a published helper for plotting table heatmaps
# Author's link: https://github.com/systematicinvestor/SIT
# NOTE(review): source()-ing remote code executes it unreviewed — pin/vendor a
# local copy of sit.gz for reproducibility and safety.
con <- gzcon(url('https://github.com/systematicinvestor/SIT/raw/master/sit.gz', 'rb'))
source(con)
close(con)
# Create a matrix for plotting (retention percentages only; drop id columns)
heatmap_m <- as.matrix(cohort_perct[, 3:ncol(cohort_perct)])
colnames(heatmap_m) <- paste('% in week ', 1:ncol(heatmap_m), sep = ' ')
rownames(heatmap_m) <- as.factor(cohort_perct$Signupdt)
# Plot the heatmap
heatmap_m[] <- plota.format(100 * heatmap_m, 0, '', '%') # convert decimals to percentage strings
plot.table(heatmap_m, smain = 'Correlation', highlight = TRUE)

# According to this heatmap, I think week 10 is roughly where retention reaches a
# steady state. This is an estimate based on how the retention rate (color) changes
# across weeks: retention drops sharply from around 70% to around 20% in the first
# 10 weeks, but from week 10 to week 24 it changes only slightly, from around 20%
# to around 15%. For a more precise estimate, we could draw line charts and find
# the elbow (fastest-changing point). The line chart in Question 3
# ('Retention by auth types') can also be used to check this answer.
The following chart shows that the retention rate decreases dramatically in the first ten weeks after users register. After week 10 the retention rate continues to decrease, but at a slower and less stable rate.
# Question 3: retention by auth type for the 2016-07-24 and 2016-08-18 cohorts
# Step 1: Filter to the two signup dates and aggregate by cohort, week, auth type
q3_act_tmp <- activities %>%
  filter(signup_dt %in% as.Date(c('2016-07-24', '2016-08-18'))) %>%
  group_by(signup_dt, visit_week, auth_type) %>%
  summarise(num_users = n_distinct(uid))
# Step 1.5 (optional): the wide retention-rate table; not strictly needed here
q3_cohorts <- spread(q3_act_tmp, key = 'visit_week', value = 'num_users')
q3_cohorts_filter <- q3_cohorts[, c(1:27)]
q3_cohort_perct <- data.frame(lapply(q3_cohorts_filter[, -c(1, 2, 3)], function(x) x/q3_cohorts_filter[, 3]))
q3_cohort_perct <- bind_cols(q3_cohorts_filter[, c(1, 2, 3)], q3_cohort_perct) # prepend the id columns
colnames(q3_cohort_perct) <- append(c('Signupdt', 'Auth type', '# signed up'), paste('% visiting in week ', 1:(ncol(q3_cohort_perct) - 3), sep = ' '))
# Step 2: Convert absolute counts to retention rates
# (% of signed-up users visiting again). Keep the long (unreshaped) format here,
# because ggplot prefers long-format data for visualization.
q3_tmp <- filter(q3_act_tmp, visit_week == 0) # cohort sizes (denominators)
retention_tmp <- left_join(q3_act_tmp, q3_tmp[, -2], by = c("signup_dt", "auth_type")) # attach denominators
retention_tmp$ret_rate <- retention_tmp$num_users.x / retention_tmp$num_users.y # retention rate
# Step 3: Keep only weeks 1 through 24
retention <- retention_tmp[retention_tmp$visit_week > 0 & retention_tmp$visit_week <= 24, c('signup_dt', 'visit_week', 'auth_type', 'ret_rate')]
# Step 4: Line chart, one panel per signup cohort, one line per auth type
g_line <- ggplot(retention, aes(x = visit_week))
g_line + geom_line(aes(y = ret_rate, color = auth_type)) +
  facet_grid(~signup_dt) +
  labs(title = 'Retention rate by week in different auth types') +
  theme_linedraw()

# The trend of the retention rate is very similar across auth types; retention
# does not vary significantly by auth type.
(Assumption: revisiting on the signup day itself is not counted as the first return visit.)
# Question 4: in which week does each user FIRST come back?
# Step 1: Drop visits on the signup day (week 0), keep each user's earliest
# remaining visit week, then count users per (signup cohort, first-return week).
# Users who never return (visit_week is NA or only 0) drop out entirely here.
q4_act_tmp <- activities %>%
  filter(visit_week != 0) %>%
  group_by(uid) %>%
  filter(visit_week == min(visit_week)) %>%
  group_by(signup_dt, visit_week) %>%
  summarise(num_users = n_distinct(uid))
# Step 2: Reshape to the required format (rows: signup_dt, columns: first-return week)
q4_cohorts <- spread(q4_act_tmp, key = 'visit_week', value = 'num_users')
q4_cohorts[is.na(q4_cohorts)] <- 0 # no user returned first in that week
# NOTE(review): this row sum counts users who returned at least once, not all
# signups — users who never came back were filtered out in Step 1. Confirm this
# is the intended denominator.
q4_cohorts['# of signed up'] <- rowSums(q4_cohorts[, -1])
q4_cohorts_filter <- q4_cohorts[q4_cohorts$signup_dt <= as.Date('2016-10-30'), c(1:25, ncol(q4_cohorts))]
# Step 3: Convert absolute counts to percentages (% of users first returning each week)
q4_cohort_perct <- data.frame(lapply(q4_cohorts_filter[, -c(1, ncol(q4_cohorts_filter))], function(x) x/q4_cohorts_filter$`# of signed up`))
q4_cohort_perct <- bind_cols(q4_cohorts_filter[, c(1, ncol(q4_cohorts_filter))], q4_cohort_perct) # prepend id columns
colnames(q4_cohort_perct) <- append(c('Signupdt', '# signed up'), paste('% visiting first in week ', 1:(ncol(q4_cohort_perct) - 2), sep = ' '))
q4_cohort_perct
## Source: local data frame [152 x 26]
## Groups: signup_dt [152]
##
## Signupdt `# signed up` `% visiting first in week 1`
## <date> <dbl> <dbl>
## 1 2016-06-01 347 0.6887608
## 2 2016-06-02 386 0.7590674
## 3 2016-06-03 361 0.7673130
## 4 2016-06-04 398 0.7361809
## 5 2016-06-05 477 0.7064990
## 6 2016-06-06 412 0.7111650
## 7 2016-06-07 389 0.7172237
## 8 2016-06-08 387 0.7416021
## 9 2016-06-09 332 0.7379518
## 10 2016-06-10 338 0.7544379
## # ... with 142 more rows, and 23 more variables: `% visiting first in week
## # 2` <dbl>, `% visiting first in week 3` <dbl>, ...,
## # `% visiting first in week 24` <dbl>
On average, what proportion of users don't come back even after 24 weeks?
# For each cohort, 1 - rowSums(%) is the share whose first return falls AFTER
# week 24 (within the denominator of users who eventually return); average the
# share across cohorts. mean() is the idiomatic form of sum(x) / length(x).
avg_proportion <- mean(1 - rowSums(q4_cohort_perct[, -c(1, 2)]))
avg_proportion
## [1] 0.01364513